In [1]:
import os
import plotly.io as pio
import helpsk as hlp

# Select plotly's 'notebook' renderer as the default for figures shown below.
pio.renderers.default = 'notebook'

def get_project_directory():
    """Return the project directory derived from the current working directory.

    Strips, in this order, a trailing ``/develop``, ``/deliver``, ``/archive``
    and ``/code/notebooks`` from ``os.getcwd()``. For example
    ``/proj/code/notebooks/develop`` and ``/proj/code/notebooks`` both yield
    ``/proj``; a directory that ends with none of these is returned unchanged.

    Only trailing path segments are removed. A plain ``str.replace`` would
    also mangle unrelated parts of the path that merely contain the text,
    e.g. ``/home/developer/...`` would turn into ``/homeer/...``.

    Returns:
        str: the working directory with the notebook sub-folders removed.
    """
    directory = os.getcwd()
    # Order matters: the stage folder (if any) must be removed before the
    # '/code/notebooks' suffix becomes visible at the end of the path.
    for suffix in ('/develop', '/deliver', '/archive', '/code/notebooks'):
        if directory.endswith(suffix):
            directory = directory[:-len(suffix)]
    return directory

Results

In [2]:
# Path of the saved BayesSearchCV experiment results (YAML) inside the project tree.
file_name = os.path.join(
    get_project_directory(),
    'artifacts/models/experiments',
    'multi-model-BayesSearchCV-2022-03-07-20-09-07.yaml',
)
In [3]:
# Load the saved experiment results; every cell below reads from `results`.
results = hlp.sklearn_eval.MLExperimentResults.from_yaml_file(
    yaml_file_name=file_name,
)

Best Scores/Params

In [4]:
# Best score of the search; in this run it equals the rank-1 "roc_auc Mean"
# shown in the trials table further down (0.767 after rounding).
results.best_score
Out[4]:
0.7668027026011365
In [5]:
# Pipeline choices (model, imputer, scaler, pca, encoder) reported for the best
# trial; in this run they match the rank-1 row of the trials table below.
results.best_params
Out[5]:
{'model': 'RandomForestClassifier()',
 'imputer': 'SimpleImputer()',
 'scaler': 'None',
 'pca': 'None',
 'encoder': 'OneHotEncoder()'}
In [6]:
# Best model from each model-type: rank the trials inside every model group by
# mean roc_auc (highest first, ties resolved by row order) and keep each
# group's top-ranked row.
df = results.to_formatted_dataframe(return_style=False, include_rank=True)
df = df.assign(
    model_rank=lambda frame: (
        frame.groupby("model")["roc_auc Mean"].rank(method="first", ascending=False)
    ),
)
df.loc[df["model_rank"] == 1]
Out[6]:
rank roc_auc Mean roc_auc 95CI.LO roc_auc 95CI.HI model C max_features max_depth n_estimators min_samples_split ... subsample colsample_bytree colsample_bylevel reg_alpha reg_lambda imputer scaler pca encoder model_rank
23 1 0.767 0.720 0.814 RandomForestClassifier() NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN SimpleImputer() None None OneHotEncoder() 1.0
5 2 0.763 0.725 0.802 LogisticRegression() NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN SimpleImputer() StandardScaler() None OneHotEncoder() 1.0
7 3 0.761 0.720 0.803 LinearSVC() 0.280746 NaN NaN NaN NaN ... NaN NaN NaN NaN NaN SimpleImputer(strategy='most_frequent') MinMaxScaler() PCA('mle') OneHotEncoder() 1.0
16 5 0.760 0.701 0.819 ExtraTreesClassifier() NaN 0.68466 30.0 1659.0 25.0 ... NaN NaN NaN NaN NaN SimpleImputer() None PCA('mle') OneHotEncoder() 1.0
28 9 0.753 0.710 0.796 XGBClassifier() NaN NaN 5.0 1246.0 NaN ... 0.95619 0.694741 0.518639 0.242199 1.220693 SimpleImputer(strategy='median') None None OneHotEncoder() 1.0

5 rows × 25 columns

In [7]:
# Styled table of all trials with their rank. num_rows=1000 is far above the 30
# trials in this run, so presumably it only prevents truncation -- confirm
# against the helpsk documentation.
results.to_formatted_dataframe(
    return_style=True,
    include_rank=True,
    num_rows=1000,
)
Out[7]:
rank roc_auc Mean roc_auc 95CI.LO roc_auc 95CI.HI model C max_features max_depth n_estimators min_samples_split min_samples_leaf max_samples criterion learning_rate min_child_weight subsample colsample_bytree colsample_bylevel reg_alpha reg_lambda imputer scaler pca encoder
1 0.767 0.720 0.814 RandomForestClassifier() <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> SimpleImputer() None None OneHotEncoder()
2 0.763 0.725 0.802 LogisticRegression() <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> SimpleImputer() StandardScaler() None OneHotEncoder()
3 0.761 0.720 0.803 LinearSVC() 0.281 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> SimpleImputer(strategy='most_frequent') MinMaxScaler() PCA('mle') OneHotEncoder()
4 0.761 0.697 0.825 LogisticRegression() 0.001 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> SimpleImputer(strategy='median') MinMaxScaler() None OneHotEncoder()
5 0.760 0.701 0.819 ExtraTreesClassifier() <NA> 0.685 30.000 1,659.000 25.000 11.000 0.781 gini <NA> <NA> <NA> <NA> <NA> <NA> <NA> SimpleImputer() None PCA('mle') OneHotEncoder()
6 0.757 0.711 0.803 ExtraTreesClassifier() <NA> 0.681 38.000 1,461.000 23.000 10.000 0.553 gini <NA> <NA> <NA> <NA> <NA> <NA> <NA> SimpleImputer(strategy='median') None None CustomOrdinalEncoder()
7 0.755 0.714 0.796 RandomForestClassifier() <NA> 0.599 70.000 1,858.000 39.000 22.000 0.851 gini <NA> <NA> <NA> <NA> <NA> <NA> <NA> SimpleImputer(strategy='most_frequent') None None OneHotEncoder()
8 0.753 0.716 0.791 RandomForestClassifier() <NA> 0.303 81.000 1,063.000 15.000 27.000 0.502 gini <NA> <NA> <NA> <NA> <NA> <NA> <NA> SimpleImputer(strategy='median') None None OneHotEncoder()
9 0.753 0.710 0.796 XGBClassifier() <NA> <NA> 5.000 1,246.000 <NA> <NA> <NA> <NA> 0.023 15.000 0.956 0.695 0.519 0.242 1.221 SimpleImputer(strategy='median') None None OneHotEncoder()
10 0.752 0.698 0.805 ExtraTreesClassifier() <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> SimpleImputer() None None OneHotEncoder()
11 0.751 0.721 0.781 LinearSVC() <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> SimpleImputer() StandardScaler() None OneHotEncoder()
12 0.749 0.698 0.801 XGBClassifier() <NA> <NA> 1.000 1,974.000 <NA> <NA> <NA> <NA> 0.024 4.000 0.543 0.620 0.876 0.034 1.445 SimpleImputer() None PCA('mle') OneHotEncoder()
13 0.749 0.706 0.792 ExtraTreesClassifier() <NA> 0.408 87.000 1,423.000 25.000 19.000 0.989 entropy <NA> <NA> <NA> <NA> <NA> <NA> <NA> SimpleImputer(strategy='most_frequent') None PCA('mle') CustomOrdinalEncoder()
14 0.747 0.694 0.799 ExtraTreesClassifier() <NA> 0.710 15.000 1,493.000 33.000 27.000 0.914 gini <NA> <NA> <NA> <NA> <NA> <NA> <NA> SimpleImputer(strategy='most_frequent') None PCA('mle') OneHotEncoder()
15 0.746 0.716 0.776 LogisticRegression() 23.327 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> SimpleImputer(strategy='median') StandardScaler() None OneHotEncoder()
16 0.745 0.704 0.786 RandomForestClassifier() <NA> 0.762 88.000 1,235.000 8.000 7.000 0.666 gini <NA> <NA> <NA> <NA> <NA> <NA> <NA> SimpleImputer(strategy='median') None PCA('mle') CustomOrdinalEncoder()
17 0.744 0.709 0.779 RandomForestClassifier() <NA> 0.567 38.000 1,060.000 19.000 41.000 0.656 entropy <NA> <NA> <NA> <NA> <NA> <NA> <NA> SimpleImputer() None PCA('mle') OneHotEncoder()
18 0.742 0.686 0.798 XGBClassifier() <NA> <NA> 10.000 1,146.000 <NA> <NA> <NA> <NA> 0.025 14.000 0.771 0.548 0.748 0.093 1.892 SimpleImputer(strategy='median') None PCA('mle') OneHotEncoder()
19 0.738 0.686 0.790 XGBClassifier() <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> SimpleImputer() None None OneHotEncoder()
20 0.736 0.695 0.777 ExtraTreesClassifier() <NA> 0.740 14.000 1,645.000 5.000 43.000 0.741 entropy <NA> <NA> <NA> <NA> <NA> <NA> <NA> SimpleImputer(strategy='most_frequent') None PCA('mle') CustomOrdinalEncoder()
21 0.734 0.695 0.773 RandomForestClassifier() <NA> 0.770 70.000 1,570.000 16.000 39.000 0.910 entropy <NA> <NA> <NA> <NA> <NA> <NA> <NA> SimpleImputer(strategy='most_frequent') None None CustomOrdinalEncoder()
22 0.730 0.702 0.758 LogisticRegression() 0.000 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> SimpleImputer(strategy='median') StandardScaler() None CustomOrdinalEncoder()
23 0.727 0.690 0.765 LinearSVC() 0.361 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> SimpleImputer(strategy='median') MinMaxScaler() PCA('mle') CustomOrdinalEncoder()
24 0.727 0.689 0.765 LinearSVC() 0.746 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> SimpleImputer(strategy='median') MinMaxScaler() None CustomOrdinalEncoder()
25 0.727 0.692 0.762 LogisticRegression() 3.489 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> SimpleImputer(strategy='median') StandardScaler() PCA('mle') CustomOrdinalEncoder()
26 0.726 0.697 0.755 LogisticRegression() 0.000 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> SimpleImputer(strategy='median') StandardScaler() PCA('mle') CustomOrdinalEncoder()
27 0.717 0.676 0.758 XGBClassifier() <NA> <NA> 13.000 1,153.000 <NA> <NA> <NA> <NA> 0.026 3.000 0.685 0.549 0.802 0.016 2.353 SimpleImputer() None PCA('mle') CustomOrdinalEncoder()
28 0.714 0.679 0.749 XGBClassifier() <NA> <NA> 4.000 1,181.000 <NA> <NA> <NA> <NA> 0.067 7.000 0.557 0.763 0.592 0.001 2.984 SimpleImputer(strategy='median') None PCA('mle') CustomOrdinalEncoder()
29 0.701 0.669 0.733 LinearSVC() 10.021 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> SimpleImputer() MinMaxScaler() PCA('mle') CustomOrdinalEncoder()
30 0.660 0.610 0.710 LinearSVC() 0.000 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> SimpleImputer(strategy='most_frequent') MinMaxScaler() None OneHotEncoder()
In [8]:
# Random-forest trials only; the rank column is recomputed within this subset
# (1..6 in this run's output rather than the overall ranks).
results.to_formatted_dataframe(
    query='model == "RandomForestClassifier()"',
    include_rank=True,
)
Out[8]:
rank roc_auc Mean roc_auc 95CI.LO roc_auc 95CI.HI max_features max_depth n_estimators min_samples_split min_samples_leaf max_samples criterion imputer pca encoder
1 0.767 0.720 0.814 <NA> <NA> <NA> <NA> <NA> <NA> <NA> SimpleImputer() None OneHotEncoder()
2 0.755 0.714 0.796 0.599 70.000 1,858.000 39.000 22.000 0.851 gini SimpleImputer(strategy='most_frequent') None OneHotEncoder()
3 0.753 0.716 0.791 0.303 81.000 1,063.000 15.000 27.000 0.502 gini SimpleImputer(strategy='median') None OneHotEncoder()
4 0.745 0.704 0.786 0.762 88.000 1,235.000 8.000 7.000 0.666 gini SimpleImputer(strategy='median') PCA('mle') CustomOrdinalEncoder()
5 0.744 0.709 0.779 0.567 38.000 1,060.000 19.000 41.000 0.656 entropy SimpleImputer() PCA('mle') OneHotEncoder()
6 0.734 0.695 0.773 0.770 70.000 1,570.000 16.000 39.000 0.910 entropy SimpleImputer(strategy='most_frequent') None CustomOrdinalEncoder()
In [9]:
# Logistic-regression trials only; the rank column is recomputed within this
# subset (1..6 in this run's output rather than the overall ranks).
results.to_formatted_dataframe(
    query='model == "LogisticRegression()"',
    include_rank=True,
)
Out[9]:
rank roc_auc Mean roc_auc 95CI.LO roc_auc 95CI.HI C imputer scaler pca encoder
1 0.763 0.725 0.802 <NA> SimpleImputer() StandardScaler() None OneHotEncoder()
2 0.761 0.697 0.825 0.001 SimpleImputer(strategy='median') MinMaxScaler() None OneHotEncoder()
3 0.746 0.716 0.776 23.327 SimpleImputer(strategy='median') StandardScaler() None OneHotEncoder()
4 0.730 0.702 0.758 0.000 SimpleImputer(strategy='median') StandardScaler() None CustomOrdinalEncoder()
5 0.727 0.692 0.762 3.489 SimpleImputer(strategy='median') StandardScaler() PCA('mle') CustomOrdinalEncoder()
6 0.726 0.697 0.755 0.000 SimpleImputer(strategy='median') StandardScaler() PCA('mle') CustomOrdinalEncoder()

BayesSearchCV Performance Over Time

In [10]:
# Score per trial for all models, faceted on the `model` column.
results.plot_performance_across_trials(
    facet_by='model',
).show()
In [11]:
# Score per trial, restricted to the random-forest trials.
results.plot_performance_across_trials(
    query='model == "RandomForestClassifier()"',
).show()

Variable Performance Over Time

In [12]:
# Parameter values tried in each trial, restricted to the random-forest trials.
results.plot_parameter_values_across_trials(
    query='model == "RandomForestClassifier()"',
).show()

Scatter Matrix

In [13]:
# results.plot_scatter_matrix(query='model == "RandomForestClassifier()"',
#                             height=1000, width=1000).show()

Variable Performance - Numeric

In [14]:
# Score against the numeric hyper-parameters of the random-forest trials.
# The call's return value is the cell output (no explicit .show() here).
results.plot_performance_numeric_params(
    query='model == "RandomForestClassifier()"',
    height=800,
)
In [15]:
# Parallel-coordinates view of the random-forest trials.
results.plot_parallel_coordinates(
    query='model == "RandomForestClassifier()"',
).show()

Variable Performance - Non-Numeric

In [16]:
# Score against the non-numeric parameters of the random-forest trials.
results.plot_performance_non_numeric_params(
    query='model == "RandomForestClassifier()"',
).show()

In [17]:
# Score versus max_features for the random-forest trials, with max_depth passed
# as the marker size and encoder as the colour.
results.plot_score_vs_parameter(query='model == "RandomForestClassifier()"',
                                parameter='max_features',
                                size='max_depth',
                                color='encoder')

In [18]:
# results.plot_parameter_vs_parameter(
#     query='model == "XGBClassifier()"',
#     parameter_x='colsample_bytree',
#     parameter_y='learning_rate',
#     size='max_depth'
# )
In [19]:
# results.plot_parameter_vs_parameter(
#     query='model == "XGBClassifier()"',
#     parameter_x='colsample_bytree',
#     parameter_y='learning_rate',
#     size='imputer'
# )